# Input files (raw HMIS extracts expected in the working directory):
#   Client.csv
#   Disabilities.csv
#   EmploymentEducation.csv
#   Enrollment.csv
#   Exit.csv
#   HealthAndDV.csv
#   IncomeBenefits.csv
#   Services.csv
# Work from the hackathon data directory.
# NOTE(review): hard-coded absolute path -- acceptable for a one-off script.
setwd("C:/Users/Kyu/Google Drive/Hackathon/data")
# Strings to treat as missing on import. BUG FIX: the original passed the NA
# *value* in na.strings (ignored by read.csv) instead of the literal string
# "NA", which is read.csv's own default missing-value sentinel.
na_strs <- c("", " ", "NULL", "NA")
# Client keeps strings as character; the other tables keep the (pre-R-4.0)
# default string-to-factor conversion, which later factor-level code relies on.
Client <- read.csv("Client.csv", na.strings=na_strs, stringsAsFactors=FALSE)
Disabilities <- read.csv("Disabilities.csv", na.strings=na_strs)
EmploymentEducation <- read.csv("EmploymentEducation.csv", na.strings=na_strs)
Enrollment <- read.csv("Enrollment.csv", na.strings=na_strs)
Exit <- read.csv("Exit.csv", na.strings=na_strs)
HealthAndDV <- read.csv("HealthAndDV.csv", na.strings=na_strs)
IncomeBenefits <- read.csv("IncomeBenefits.csv", na.strings=na_strs)
Services <- read.csv("Services.csv", na.strings=na_strs)
# Bookkeeping columns that carry no predictive signal; dropped from every table below.
no_use_col <- c("DateCreated", "DateUpdated", "UserID", "ExportID", "ProjectEntryID")
# Drop PII and data-quality columns: identifying and/or heavily imbalanced.
Client$First_Name <- NULL
Client$Middle_Name <- NULL
Client$Last_Name <- NULL
Client$SSN <- NULL
Client$OtherGender <- NULL
Client$Name_Data_Quality <- NULL
Client$SSNDataQuality <- NULL
Client$DOBDataQuality <- NULL
# Rename the UUID column so every table joins on PersonalID.
# (The original wrapped the literal in a no-op paste().)
names(Client)[1] <- "PersonalID"
# Percent missing per column:
sapply(Client, function(x){sum(is.na(x))/length(x)})*100
## PersonalID DOB AmIndAKNative
## 0.000000 2.990033 0.000000
## Asian Black NativeHIOtherPacific
## 0.000000 0.000000 0.000000
## White RaceNone Gender
## 0.000000 0.000000 0.000000
## VeteranStatus YearEnteredService YearSeparated
## 0.000000 35.880399 35.880399
## WorldWarII KoreanWar VietnamWar
## 0.000000 0.000000 0.000000
## DesertStorm AfghanistanOEF IraqOIF
## 0.000000 0.000000 0.000000
## IraqOND OtherTheater MilitaryBranch
## 0.000000 0.000000 33.887043
## Discharge_Status
## 33.887043
# Keep only columns with < 5% missing, then drop bookkeeping columns and the row ID.
cat_Disabilities <- colMeans(is.na(Disabilities)) * 100
Disabilities <- Disabilities[, cat_Disabilities < 5, drop = FALSE]
Disabilities <- Disabilities[, !(colnames(Disabilities) %in% no_use_col), drop = FALSE]
Disabilities$DisabilitiesID <- NULL
# Percent missing per remaining column:
sapply(Disabilities, function(x){sum(is.na(x))/length(x)})*100
## PersonalID InformationDate DisabilityType
## 0 0 0
## DisabilityResponse DataCollectionStage
## 0 0
# Keep only columns with < 5% missing, then drop bookkeeping columns and the row ID.
cat_EmploymentEducation <- colMeans(is.na(EmploymentEducation)) * 100
EmploymentEducation <- EmploymentEducation[, cat_EmploymentEducation < 5, drop = FALSE]
EmploymentEducation <- EmploymentEducation[, !(colnames(EmploymentEducation) %in% no_use_col), drop = FALSE]
EmploymentEducation$EmploymentEducationID <- NULL
# Percent missing per remaining column:
sapply(EmploymentEducation, function(x){sum(is.na(x))/length(x)})*100
## PersonalID InformationDate Employed
## 0 0 0
## DataCollectionStage
## 0
# Keep only columns with < 5% missing, then drop bookkeeping columns and
# the household/project identifiers.
cat_Enrollment <- colMeans(is.na(Enrollment)) * 100
Enrollment <- Enrollment[, cat_Enrollment < 5, drop = FALSE]
Enrollment <- Enrollment[, !(colnames(Enrollment) %in% no_use_col), drop = FALSE]
Enrollment$HouseholdID <- NULL
Enrollment$ProjectID <- NULL
# Percent missing per remaining column:
sapply(Enrollment, function(x){sum(is.na(x))/length(x)})*100
## PersonalID EntryDate RelationshipToHoH
## 0.000000 0.000000 0.000000
## LastPermanentStreet LastPermanentCity LastPermanentState
## 4.532578 4.532578 4.532578
## LastPermanentZIP
## 4.532578
# Keep only columns with < 5% missing, then drop bookkeeping columns and the row ID.
cat_Exit <- colMeans(is.na(Exit)) * 100
Exit <- Exit[, cat_Exit < 5, drop = FALSE]
Exit <- Exit[, !(colnames(Exit) %in% no_use_col), drop = FALSE]
Exit$ExitID <- NULL
# Percent missing per remaining column:
sapply(Exit, function(x){sum(is.na(x))/length(x)})*100
## PersonalID ExitDate Destination
## 0.00 0.00 1.25
# Keep only columns with < 5% missing, then drop bookkeeping columns,
# the row ID and the DueDate column.
cat_HealthAndDV <- colMeans(is.na(HealthAndDV)) * 100
HealthAndDV <- HealthAndDV[, cat_HealthAndDV < 5, drop = FALSE]
HealthAndDV <- HealthAndDV[, !(colnames(HealthAndDV) %in% no_use_col), drop = FALSE]
HealthAndDV$HealthAndDVID <- NULL
HealthAndDV$DueDate <- NULL
# Percent missing per remaining column:
sapply(HealthAndDV, function(x){sum(is.na(x))/length(x)})*100
## PersonalID InformationDate DomesticViolenceVictim
## 0 0 0
## DataCollectionStage
## 0
# Keep only columns with < 5% missing, then drop bookkeeping columns and the row ID.
cat_IncomeBenefits <- colMeans(is.na(IncomeBenefits)) * 100
IncomeBenefits <- IncomeBenefits[, cat_IncomeBenefits < 5, drop = FALSE]
IncomeBenefits <- IncomeBenefits[, !(colnames(IncomeBenefits) %in% no_use_col), drop = FALSE]
IncomeBenefits$IncomeBenefitsID <- NULL
# Percent missing per remaining column:
sapply(IncomeBenefits, function(x){sum(is.na(x))/length(x)})*100
## PersonalID InformationDate Earned
## 0.00000 0.00000 0.00000
## Unemployment SSI SSDI
## 0.00000 0.00000 0.00000
## VADisabilityService VADisabilityNonService PrivateDisability
## 0.00000 0.00000 0.00000
## WorkersComp TANF GA
## 0.00000 0.00000 0.00000
## SocSecRetirement Pension ChildSupport
## 0.00000 0.00000 0.00000
## Alimony OtherIncomeSource SNAP
## 0.00000 0.00000 0.00000
## WIC TANFChildCare TANFTransportation
## 0.00000 0.00000 0.00000
## OtherTANF RentalAssistanceOngoing RentalAssistanceTemp
## 0.00000 0.00000 0.00000
## OtherBenefitsSource InsuranceFromAnySource Medicaid
## 0.00000 2.29682 0.00000
## Medicare SCHIP VAMedicalServices
## 0.00000 0.00000 0.00000
## EmployerProvided COBRA PrivatePay
## 0.00000 0.00000 0.00000
## StateHealthIns DataCollectionStage
## 0.00000 0.00000
# Keep only columns with < 5% missing, then drop bookkeeping columns and the row ID.
cat_Services <- colMeans(is.na(Services)) * 100
Services <- Services[, cat_Services < 5, drop = FALSE]
Services <- Services[, !(colnames(Services) %in% no_use_col), drop = FALSE]
Services$ServicesID <- NULL
# Percent missing per remaining column:
sapply(Services, function(x){sum(is.na(x))/length(x)})*100
## PersonalID DateProvided RecordType TypeProvided
## 0 0 0 0
library(mlr)       # summarizeColumns()
library(lubridate) # for dates: year(), month()
library(caret)     # dummyVars(), nearZeroVar()
# Inspect the pruned Client table.
dim(Client)
## [1] 301 22
str(Client)
## 'data.frame': 301 obs. of 22 variables:
## $ PersonalID : int 90077 90703 90739 91161 91501 95126 95759 98250 99577 101863 ...
## $ DOB : chr "1/1/1953" "1/1/1953" "1/1/1971" "1/1/1952" ...
## $ AmIndAKNative : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Asian : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Black : int 1 1 1 1 1 1 1 1 1 1 ...
## $ NativeHIOtherPacific: int 0 0 0 0 0 0 0 0 0 0 ...
## $ White : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RaceNone : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gender : int 1 1 1 1 1 0 0 1 1 1 ...
## $ VeteranStatus : int 1 1 0 1 1 1 1 1 1 1 ...
## $ YearEnteredService : int 1971 1971 NA 1970 1974 2000 1982 1984 1980 1977 ...
## $ YearSeparated : int 1973 1973 NA 1973 1977 2003 1985 1987 1982 1983 ...
## $ WorldWarII : int 0 0 0 0 0 0 0 0 0 0 ...
## $ KoreanWar : int 0 0 0 0 0 0 0 0 0 0 ...
## $ VietnamWar : int 0 0 0 1 0 0 0 0 0 0 ...
## $ DesertStorm : int 0 0 1 0 0 0 0 0 0 0 ...
## $ AfghanistanOEF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ IraqOIF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ IraqOND : int 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherTheater : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MilitaryBranch : int 4 4 NA 1 2 2 1 1 4 1 ...
## $ Discharge_Status : int 2 1 NA 1 1 1 2 1 1 1 ...
# Per-column summary (NA counts, mean/dispersion, levels) via mlr.
summarizeColumns(Client)
## name type na mean disp median
## 1 PersonalID integer 0 2.243461e+05 4.778426e+04 247742
## 2 DOB character 9 NA NA NA
## 3 AmIndAKNative integer 0 6.644518e-03 8.137794e-02 0
## 4 Asian integer 0 0.000000e+00 0.000000e+00 0
## 5 Black integer 0 6.013289e-01 4.904402e-01 1
## 6 NativeHIOtherPacific integer 0 0.000000e+00 0.000000e+00 0
## 7 White integer 0 3.953488e-01 4.897397e-01 0
## 8 RaceNone integer 0 0.000000e+00 0.000000e+00 0
## 9 Gender integer 0 1.388704e+00 8.025692e+00 1
## 10 VeteranStatus integer 0 1.675415e+01 3.632950e+01 1
## 11 YearEnteredService integer 108 1.984699e+03 1.148064e+01 1982
## 12 YearSeparated integer 108 1.988679e+03 1.275655e+01 1986
## 13 WorldWarII integer 0 0.000000e+00 0.000000e+00 0
## 14 KoreanWar integer 0 0.000000e+00 0.000000e+00 0
## 15 VietnamWar integer 0 2.325581e-02 1.509659e-01 0
## 16 DesertStorm integer 0 2.657807e-02 1.611146e-01 0
## 17 AfghanistanOEF integer 0 2.657807e-02 1.611146e-01 0
## 18 IraqOIF integer 0 3.654485e-02 1.879540e-01 0
## 19 IraqOND integer 0 0.000000e+00 0.000000e+00 0
## 20 OtherTheater integer 0 0.000000e+00 0.000000e+00 0
## 21 MilitaryBranch integer 102 2.015075e+00 1.178414e+00 1
## 22 Discharge_Status integer 102 1.547739e+00 1.350826e+00 1
## mad min max nlevs
## 1 2314.3386 90077 252131 0
## 2 NA 1 15 69
## 3 0.0000 0 1 0
## 4 0.0000 0 0 0
## 5 0.0000 0 1 0
## 6 0.0000 0 0 0
## 7 0.0000 0 1 0
## 8 0.0000 0 0 0
## 9 0.0000 0 99 0
## 10 0.0000 0 99 0
## 11 10.3782 1954 2014 0
## 12 10.3782 1947 2015 0
## 13 0.0000 0 0 0
## 14 0.0000 0 0 0
## 15 0.0000 0 1 0
## 16 0.0000 0 1 0
## 17 0.0000 0 1 0
## 18 0.0000 0 1 0
## 19 0.0000 0 0 0
## 20 0.0000 0 0 0
## 21 0.0000 1 4 0
## 22 0.0000 1 7 0
# check if the ID is unique in the dataset (TRUE -> exactly one row per person)
nrow(Client) == length(table(Client$PersonalID))
## [1] TRUE
# Extract the DOB year from the m/d/Y date string, then drop the raw date.
Client$DOB <- as.POSIXct(strptime(Client$DOB, format="%m/%d/%Y"))
Client$DOByear <- year(Client$DOB)
Client$DOB <- NULL
# Impute missing service years with the column mean.
# (The original wrapped the column in a redundant as.numeric() inside is.na().)
Client$YearEnteredService[is.na(Client$YearEnteredService)] <- mean(Client$YearEnteredService, na.rm=TRUE)
Client$YearSeparated[is.na(Client$YearSeparated)] <- mean(Client$YearSeparated, na.rm=TRUE)
# Convert everything except IDs and year columns to factors.
# ("non_fector" [sic] is referenced again later in the script, so the name stays.)
non_fector <- c("PersonalID", "Name_Data_Quality", "SSNDataQuality", "DOBDataQuality", "YearEnteredService", "YearSeparated", "DOByear")
for (i in names(Client[, !colnames(Client) %in% non_fector])) {
  Client[, names(Client) == i] <- as.factor(Client[, names(Client) == i])
}
# Gender is heavily imbalanced: collapse minority codes 2/3/9/99 into 1.
# (Single vectorized %in% replaces four separate single-level assignments.)
levels(Client$Gender)[levels(Client$Gender) %in% c(2, 3, 9, 99)] <- 1
table(Client$Gender, useNA = "always")
##
## 0 1 <NA>
## 91 210 0
# cleaning MilitaryBranch var: recode missing values as their own level 0.
# BUG FIX: as.numeric() on a factor returns level *indices*, not the original
# codes (R FAQ 7.10); convert via as.character() first. The original only
# worked by coincidence because the observed levels happened to be 1..4.
Client$MilitaryBranch <- as.numeric(as.character(Client$MilitaryBranch))
Client$MilitaryBranch[is.na(Client$MilitaryBranch)] <- 0
Client$MilitaryBranch <- as.factor(Client$MilitaryBranch)
# Bin minority branches 3 and 4 into level 2.
levels(Client$MilitaryBranch)[levels(Client$MilitaryBranch) %in% c(3, 4)] <- 2
table(Client$MilitaryBranch, useNA = "always")
##
## 0 1 2 <NA>
## 102 103 96 0
# cleaning Discharge_Status var: recode missing values as their own level 0.
# BUG FIX: as.numeric() on a factor returns level *indices*, not the original
# codes (R FAQ 7.10); convert via as.character() first. This matters here:
# observed values run up to 7, so a skipped level would shift every code.
Client$Discharge_Status <- as.numeric(as.character(Client$Discharge_Status))
Client$Discharge_Status[is.na(Client$Discharge_Status)] <- 0
Client$Discharge_Status <- as.factor(Client$Discharge_Status)
# Bin minority discharge statuses 3/4/5 into level 2.
levels(Client$Discharge_Status)[levels(Client$Discharge_Status) %in% c(3, 4, 5)] <- 2
table(Client$Discharge_Status, useNA = "always")
##
## 0 1 2 <NA>
## 102 150 49 0
# cleaning DOByear var: impute missing birth years with the median year
Client$DOByear[is.na(Client$DOByear)] <- median(Client$DOByear, na.rm=T)
# Percent missing per column (expected to be all zero at this point):
sapply(Client, function(x){sum(is.na(x))/length(x)})*100
## PersonalID AmIndAKNative Asian
## 0 0 0
## Black NativeHIOtherPacific White
## 0 0 0
## RaceNone Gender VeteranStatus
## 0 0 0
## YearEnteredService YearSeparated WorldWarII
## 0 0 0
## KoreanWar VietnamWar DesertStorm
## 0 0 0
## AfghanistanOEF IraqOIF IraqOND
## 0 0 0
## OtherTheater MilitaryBranch Discharge_Status
## 0 0 0
## DOByear
## 0
# Drop the two dominant race flags (Black ~60%, White ~40% per the summary
# above) — presumably to avoid a redundant/collinear race encoding, since the
# remaining flags are near-constant. TODO confirm intent.
Client$Black <- NULL
Client$White <- NULL
# One-hot encode the categorical variables (fullRank drops one level each),
# then drop the originals.
dummies <- dummyVars("~ Gender + VeteranStatus + MilitaryBranch + Discharge_Status",
                     data = Client, fullRank = TRUE)
dummies.df <- as.data.frame(predict(dummies, Client))
Client <- cbind(Client, dummies.df)
Client$Gender <- NULL
Client$VeteranStatus <- NULL
Client$MilitaryBranch <- NULL
Client$Discharge_Status <- NULL
# Convert the remaining flag factors back to their numeric codes.
# BUG FIX: plain as.numeric() on a factor yields level indices (1/2) instead
# of the original 0/1 codes; go through as.character() (R FAQ 7.10).
for (i in names(Client[, !colnames(Client) %in% non_fector])) {
  Client[, names(Client) == i] <- as.numeric(as.character(Client[, names(Client) == i]))
}
# Drop near-zero-variance predictors ("== FALSE" replaced by the idiomatic "!").
nzCol <- nearZeroVar(Client, saveMetrics = TRUE)
Client <- Client[, !nzCol$nzv]
# Derive Age (as of 2016) and bucket it by its leading digit (rough decade).
Client$Age <- 2016 - Client$DOByear
# Ages under 18 are treated as data errors and replaced with the rounded mean.
Client[Client$Age < 18, "Age"] <- round(mean(Client$Age))
# BUG FIX: substr() is vectorized; the original wrapped it in a do-nothing
# for loop (which recomputed the whole vector every iteration) plus a no-op
# paste(). The result stays character, exactly as before.
Client$Age <- substr(Client$Age, 1, 1)
# The raw year columns are no longer needed.
Client$YearEnteredService <- NULL
Client$YearSeparated <- NULL
Client$DOByear <- NULL
# Inspect Disabilities (long format: several rows per person/type/stage).
dim(Disabilities)
## [1] 3318 5
str(Disabilities)
## 'data.frame': 3318 obs. of 5 variables:
## $ PersonalID : int 90077 90077 90077 90077 90077 90077 90077 90077 90077 90077 ...
## $ InformationDate : Factor w/ 166 levels "1/11/2016","1/12/2016",..: 38 10 53 129 38 10 53 129 38 10 ...
## $ DisabilityType : int 5 5 5 5 6 6 6 6 7 7 ...
## $ DisabilityResponse : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DataCollectionStage: int 1 2 3 2 1 2 3 2 1 2 ...
summarizeColumns(Disabilities)
## name type na mean disp median
## 1 PersonalID integer 0 2.237205e+05 4.809816e+04 247333.0
## 2 InformationDate factor 0 NA 9.764919e-01 NA
## 3 DisabilityType integer 0 7.500000e+00 1.708083e+00 7.5
## 4 DisabilityResponse integer 0 2.546715e-01 5.913156e-01 0.0
## 5 DataCollectionStage integer 0 1.735986e+00 8.731280e-01 1.0
## mad min max nlevs
## 1 2453.7030 90077 252131 0
## 2 NA 6 78 166
## 3 2.2239 5 10 0
## 4 0.0000 0 3 0
## 5 0.0000 1 3 0
# Checking if the ID is unique in the dataset (FALSE: many rows per person)
nrow(Disabilities) == length(table(Disabilities$PersonalID))
## [1] FALSE
# Extract the information *month* (the old comment said "year", but the code
# builds ...Month).
Disabilities$InformationDate <- as.POSIXct(strptime(Disabilities$InformationDate, format="%m/%d/%Y"))
Disabilities$InformationDateMonth <- month(Disabilities$InformationDate)
Disabilities$InformationDate <- NULL
# BUG FIX: the original tabulated the nonexistent InformationDateYear column,
# which printed an empty table.
table(Disabilities$InformationDateMonth, useNA = "always")
# Inspect EmploymentEducation (long format: several rows per person/stage).
dim(EmploymentEducation)
## [1] 192 4
str(EmploymentEducation)
## 'data.frame': 192 obs. of 4 variables:
## $ PersonalID : int 117753 181183 117753 182784 90739 182784 106089 106089 240712 240713 ...
## $ InformationDate : Factor w/ 113 levels "1/13/2016","1/14/2016",..: 8 53 66 68 107 41 21 3 86 86 ...
## $ Employed : int 1 1 1 1 1 1 0 0 0 0 ...
## $ DataCollectionStage: int 1 1 3 1 1 3 1 3 1 1 ...
summarizeColumns(EmploymentEducation)
## name type na mean disp median
## 1 PersonalID integer 0 2.241574e+05 4.706670e+04 246509.5
## 2 InformationDate factor 0 NA 9.583333e-01 NA
## 3 Employed integer 0 6.583333e+00 3.013640e+00 8.0
## 4 DataCollectionStage integer 0 1.635417e+00 8.072939e-01 1.0
## mad min max nlevs
## 1 2104.551 90077 249591 0
## 2 NA 1 8 113
## 3 0.000 0 8 0
## 4 0.000 1 3 0
# Checking if the ID is unique in the dataset (FALSE: many rows per person)
nrow(EmploymentEducation) == length(table(EmploymentEducation$PersonalID))
## [1] FALSE
# extracting the information *month* (the variable built below is ...Month)
EmploymentEducation$InformationDate <- as.POSIXct(strptime(EmploymentEducation$InformationDate, format="%m/%d/%Y"))
EmploymentEducation$InformationDateMonth <- month(EmploymentEducation$InformationDate)
EmploymentEducation$InformationDate <- NULL
# Inspect Enrollment.
dim(Enrollment)
## [1] 353 7
str(Enrollment)
## 'data.frame': 353 obs. of 7 variables:
## $ PersonalID : int 117753 117753 181183 182784 90739 106089 240712 240713 240713 240714 ...
## $ EntryDate : Factor w/ 130 levels "1/12/2016","1/13/2016",..: 11 11 72 89 124 21 105 105 105 105 ...
## $ RelationshipToHoH : int 1 1 1 1 1 1 1 2 2 2 ...
## $ LastPermanentStreet: Factor w/ 225 levels "0","1 Jefferson Barracks Drive",..: 81 112 34 97 181 103 74 73 73 72 ...
## $ LastPermanentCity : Factor w/ 28 levels "BELLEVILLE","BISMARCK",..: 21 21 21 21 21 21 21 21 21 21 ...
## $ LastPermanentState : Factor w/ 3 levels "IL","MO","ZZ": 2 2 2 2 2 2 2 2 2 2 ...
## $ LastPermanentZIP : int 63118 63118 63138 63118 63104 63118 63135 63135 63135 63135 ...
summarizeColumns(Enrollment)
## name type na mean disp median
## 1 PersonalID integer 0 2.219889e+05 4.961181e+04 247742
## 2 EntryDate factor 0 NA 9.688385e-01 NA
## 3 RelationshipToHoH integer 0 1.518414e+00 1.000185e+00 1
## 4 LastPermanentStreet factor 16 NA NA NA
## 5 LastPermanentCity factor 16 NA NA NA
## 6 LastPermanentState factor 16 NA NA NA
## 7 LastPermanentZIP integer 16 6.069350e+04 1.217828e+04 63114
## mad min max nlevs
## 1 2314.3386 90077 252131 0
## 2 NA 1 11 130
## 3 0.0000 1 5 0
## 4 NA 1 20 225
## 5 NA 1 242 28
## 6 NA 13 311 3
## 7 19.2738 0 65401 0
# Checking if the ID is unique in the dataset (FALSE: many rows per person)
nrow(Enrollment) == length(table(Enrollment$PersonalID))
## [1] FALSE
# extracting the entry *month* (the variable built below is ...Month)
Enrollment$EntryDate <- as.POSIXct(strptime(Enrollment$EntryDate, format="%m/%d/%Y"))
Enrollment$EntryDateMonth <- month(Enrollment$EntryDate)
# extracing geo information through the zipcode
# NOTE(review): merge() keeps only ZIPs present in both tables and reorders
# rows -- the later str() shows the row count drops from 353 to 324. Confirm
# that dropping unmatched ZIPs is intended.
library(zipcode)
data(zipcode)
Enrollment <- merge(Enrollment, zipcode, by.x='LastPermanentZIP', by.y='zip')
# Drop the raw address fields; only latitude/longitude are kept.
Enrollment$LastPermanentStreet <- NULL
Enrollment$LastPermanentCity <- NULL
Enrollment$LastPermanentState <- NULL
Enrollment$city <- NULL
Enrollment$state <- NULL
# Bin the sparse relationship codes 4/5 into 3, then one-hot encode.
# (Single vectorized %in% replaces two duplicated single-value recodes.)
Enrollment$RelationshipToHoH[Enrollment$RelationshipToHoH %in% c(4, 5)] <- 3
table(Enrollment$RelationshipToHoH)
##
## 1 2 3
## 246 40 38
Enrollment$RelationshipToHoH <- as.factor(Enrollment$RelationshipToHoH)
dummies <- dummyVars("~ RelationshipToHoH", data = Enrollment, fullRank = TRUE)
dummies.df <- as.data.frame(predict(dummies, Enrollment))
Enrollment <- cbind(Enrollment, dummies.df)
Enrollment$RelationshipToHoH <- NULL
# Inspect Exit.
dim(Exit)
## [1] 160 3
str(Exit)
## 'data.frame': 160 obs. of 3 variables:
## $ PersonalID : int 117753 181183 182784 106089 240712 174821 240942 90077 243203 245519 ...
## $ ExitDate : Factor w/ 62 levels "1/11/2016","1/12/2016",..: 61 58 41 5 4 17 17 29 4 56 ...
## $ Destination: int 19 NA 10 22 10 10 10 10 10 22 ...
summarizeColumns(Exit)
## name type na mean disp median mad min
## 1 PersonalID integer 0 222930.31875 47841.456123 247332.5 2226.865 90077
## 2 ExitDate factor 0 NA 0.937500 NA NA 1
## 3 Destination integer 2 12.48734 5.186628 10.0 0.000 2
## max nlevs
## 1 252015 0
## 2 10 62
## 3 23 0
# Checking if the ID is unique in the dataset (FALSE: many rows per person)
nrow(Exit) == length(table(Exit$PersonalID))
## [1] FALSE
# extracting the exit *month* (the variable built below is ...Month)
Exit$ExitDate <- as.POSIXct(strptime(Exit$ExitDate, format="%m/%d/%Y"))
Exit$ExitDateMonth <- month(Exit$ExitDate)
table(Exit$Destination, useNA = "always")
##
## 2 3 7 9 10 11 12 13 16 19 20 22 23 <NA>
## 4 7 2 1 88 3 2 2 1 37 2 8 1 2
# Unknown destinations are coded 2, which the map below folds into group 1.
Exit$Destination[is.na(Exit$Destination)] <- 2
# Collapse the destination codes into 5 groups via a single lookup.
# Equivalent to the original chain of 16 sequential single-value recodes:
# no rule's target (1..5) appears as a later rule's source, so a
# simultaneous mapping yields identical results; unmapped values pass through.
Exit$Destination <- as.numeric(Exit$Destination)
dest_map <- c("2" = 1, "7" = 1, "9" = 1, "14" = 1, "16" = 1,
              "3" = 2, "22" = 2, "25" = 2,
              "19" = 3, "23" = 3,
              "12" = 4, "20" = 4, "21" = 4,
              "10" = 5, "11" = 5, "13" = 5)
hit <- match(as.character(Exit$Destination), names(dest_map))
Exit$Destination[!is.na(hit)] <- dest_map[hit[!is.na(hit)]]
# Inspect HealthAndDV.
dim(HealthAndDV)
## [1] 430 4
str(HealthAndDV)
## 'data.frame': 430 obs. of 4 variables:
## $ PersonalID : int 117753 181183 117753 182784 90739 182784 181183 106089 106089 240712 ...
## $ InformationDate : Factor w/ 164 levels "1/11/2016","1/12/2016",..: 16 93 113 115 156 71 101 30 5 135 ...
## $ DomesticViolenceVictim: int 0 0 0 0 0 0 0 0 0 0 ...
## $ DataCollectionStage : int 1 1 3 1 1 3 3 1 3 1 ...
summarizeColumns(HealthAndDV)
## name type na mean disp median
## 1 PersonalID integer 0 2.189243e+05 5.215057e+04 247069
## 2 InformationDate factor 0 NA 9.767442e-01 NA
## 3 DomesticViolenceVictim integer 0 7.093023e-01 2.124704e+00 0
## 4 DataCollectionStage integer 0 1.586047e+00 7.996448e-01 1
## mad min max nlevs
## 1 3048.967 90077 252131 0
## 2 NA 1 10 164
## 3 0.000 0 8 0
## 4 0.000 1 3 0
# Checking if the ID is unique in the dataset (FALSE: many rows per person)
nrow(HealthAndDV) == length(table(HealthAndDV$PersonalID))
## [1] FALSE
# Date dropped outright here (no month extracted, unlike the other tables —
# presumably intentional; confirm).
HealthAndDV$InformationDate <- NULL
# Inspect IncomeBenefits.
dim(IncomeBenefits)
## [1] 566 35
str(IncomeBenefits)
## 'data.frame': 566 obs. of 35 variables:
## $ PersonalID : int 117753 181183 117753 182784 90739 182784 181183 106089 106089 240712 ...
## $ InformationDate : Factor w/ 170 levels "1/11/2016","1/12/2016",..: 19 99 119 121 162 76 107 33 5 141 ...
## $ Earned : int 1 1 1 1 1 1 1 0 0 0 ...
## $ Unemployment : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SSI : int 0 0 0 0 0 0 0 1 1 0 ...
## $ SSDI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ VADisabilityService : int 0 0 0 0 0 0 0 0 0 1 ...
## $ VADisabilityNonService : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PrivateDisability : int 0 0 0 0 0 0 0 0 0 0 ...
## $ WorkersComp : int 0 0 0 0 0 0 0 0 0 0 ...
## $ TANF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SocSecRetirement : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Pension : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ChildSupport : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Alimony : int 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherIncomeSource : int 1 0 1 0 0 0 0 0 0 0 ...
## $ SNAP : int 1 0 1 0 0 0 0 0 1 1 ...
## $ WIC : int 0 0 0 0 0 0 0 0 0 1 ...
## $ TANFChildCare : int 0 0 0 0 0 0 0 0 0 0 ...
## $ TANFTransportation : int 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherTANF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RentalAssistanceOngoing: int 0 0 0 0 0 0 0 0 0 0 ...
## $ RentalAssistanceTemp : int 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherBenefitsSource : int 0 0 0 0 0 0 0 0 0 0 ...
## $ InsuranceFromAnySource : int NA NA NA 0 0 1 0 0 NA 1 ...
## $ Medicaid : int 0 0 0 0 0 0 0 1 0 1 ...
## $ Medicare : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SCHIP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ VAMedicalServices : int 1 0 1 0 0 0 0 1 0 1 ...
## $ EmployerProvided : int 0 0 0 0 0 0 0 0 0 0 ...
## $ COBRA : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PrivatePay : int 0 0 0 0 0 0 0 0 0 0 ...
## $ StateHealthIns : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DataCollectionStage : int 1 1 3 1 1 3 3 1 3 1 ...
# Per-column summary (NA counts, mean/dispersion, levels) via mlr.
summarizeColumns(IncomeBenefits)
## name type na mean disp median
## 1 PersonalID integer 0 2.234312e+05 4.837040e+04 247333
## 2 InformationDate factor 0 NA 9.770318e-01 NA
## 3 Earned integer 0 2.508834e-01 4.339050e-01 0
## 4 Unemployment integer 0 1.766784e-02 1.318575e-01 0
## 5 SSI integer 0 1.183746e-01 3.233369e-01 0
## 6 SSDI integer 0 1.095406e-01 3.125926e-01 0
## 7 VADisabilityService integer 0 1.219081e-01 3.274691e-01 0
## 8 VADisabilityNonService integer 0 3.886926e-02 1.934543e-01 0
## 9 PrivateDisability integer 0 1.766784e-03 4.203314e-02 0
## 10 WorkersComp integer 0 0.000000e+00 0.000000e+00 0
## 11 TANF integer 0 3.533569e-02 1.847902e-01 0
## 12 GA integer 0 0.000000e+00 0.000000e+00 0
## 13 SocSecRetirement integer 0 1.060071e-02 1.025031e-01 0
## 14 Pension integer 0 1.590106e-02 1.252035e-01 0
## 15 ChildSupport integer 0 1.060071e-02 1.025031e-01 0
## 16 Alimony integer 0 0.000000e+00 0.000000e+00 0
## 17 OtherIncomeSource integer 0 2.473498e-02 1.554537e-01 0
## 18 SNAP integer 0 3.250883e-01 4.688222e-01 0
## 19 WIC integer 0 1.590106e-02 1.252035e-01 0
## 20 TANFChildCare integer 0 5.300353e-03 7.267456e-02 0
## 21 TANFTransportation integer 0 0.000000e+00 0.000000e+00 0
## 22 OtherTANF integer 0 3.533569e-03 5.939120e-02 0
## 23 RentalAssistanceOngoing integer 0 8.833922e-03 9.365565e-02 0
## 24 RentalAssistanceTemp integer 0 0.000000e+00 0.000000e+00 0
## 25 OtherBenefitsSource integer 0 1.060071e-02 1.025031e-01 0
## 26 InsuranceFromAnySource integer 13 3.218807e-01 4.676205e-01 0
## 27 Medicaid integer 0 6.713781e-02 2.504819e-01 0
## 28 Medicare integer 0 2.120141e-02 1.441827e-01 0
## 29 SCHIP integer 0 5.123675e-02 2.206753e-01 0
## 30 VAMedicalServices integer 0 4.593640e-02 2.095324e-01 0
## 31 EmployerProvided integer 0 1.413428e-02 1.181489e-01 0
## 32 COBRA integer 0 0.000000e+00 0.000000e+00 0
## 33 PrivatePay integer 0 1.766784e-03 4.203314e-02 0
## 34 StateHealthIns integer 0 1.060071e-02 1.025031e-01 0
## 35 DataCollectionStage integer 0 1.742049e+00 8.686672e-01 1
## mad min max nlevs
## 1 2570.828 90077 252131 0
## 2 NA 1 13 170
## 3 0.000 0 1 0
## 4 0.000 0 1 0
## 5 0.000 0 1 0
## 6 0.000 0 1 0
## 7 0.000 0 1 0
## 8 0.000 0 1 0
## 9 0.000 0 1 0
## 10 0.000 0 0 0
## 11 0.000 0 1 0
## 12 0.000 0 0 0
## 13 0.000 0 1 0
## 14 0.000 0 1 0
## 15 0.000 0 1 0
## 16 0.000 0 0 0
## 17 0.000 0 1 0
## 18 0.000 0 1 0
## 19 0.000 0 1 0
## 20 0.000 0 1 0
## 21 0.000 0 0 0
## 22 0.000 0 1 0
## 23 0.000 0 1 0
## 24 0.000 0 0 0
## 25 0.000 0 1 0
## 26 0.000 0 1 0
## 27 0.000 0 1 0
## 28 0.000 0 1 0
## 29 0.000 0 1 0
## 30 0.000 0 1 0
## 31 0.000 0 1 0
## 32 0.000 0 0 0
## 33 0.000 0 1 0
## 34 0.000 0 1 0
## 35 0.000 1 3 0
# Checking if the ID is unique in the dataset (FALSE: many rows per person)
nrow(IncomeBenefits) == length(table(IncomeBenefits$PersonalID))
## [1] FALSE
# Fill missing insurance flags with 0 ("no insurance").
# (The original comment claimed a median fill; the code imputes a constant 0.)
IncomeBenefits$InsuranceFromAnySource[is.na(IncomeBenefits$InsuranceFromAnySource)] <- 0
# Drop near-zero-variance predictors ("== FALSE" replaced by the idiomatic "!")
# and the raw date column.
nzCol <- nearZeroVar(IncomeBenefits, saveMetrics = TRUE)
IncomeBenefits <- IncomeBenefits[, !nzCol$nzv]
IncomeBenefits$InformationDate <- NULL
# Inspect Services.
dim(Services)
## [1] 2684 4
str(Services)
## 'data.frame': 2684 obs. of 4 variables:
## $ PersonalID : int 247032 248537 248586 248537 248608 189603 189603 249591 189044 248812 ...
## $ DateProvided: Factor w/ 115 levels "1/1/2016","1/10/2016",..: 44 46 28 29 27 32 38 40 40 41 ...
## $ RecordType : int 144 144 144 144 144 144 152 152 144 144 ...
## $ TypeProvided: int 2 2 2 6 2 2 12 1 6 2 ...
summarizeColumns(Services)
## name type na mean disp median mad min
## 1 PersonalID integer 0 2.153891e+05 54211.398580 247332 2910.344 90077
## 2 DateProvided factor 0 NA 0.976155 NA NA 1
## 3 RecordType integer 0 1.455410e+02 3.155463 144 0.000 144
## 4 TypeProvided integer 0 2.934426e+00 2.370339 2 0.000 1
## max nlevs
## 1 252131 0
## 2 64 115
## 3 152 0
## 4 14 0
# Checking if the ID is unique in the dataset (FALSE: many rows per person)
nrow(Services) == length(table(Services$PersonalID))
## [1] FALSE
# Prefix TypeProvided with a letter per RecordType (144 -> "A", 152 -> "B")
# so the two service code spaces cannot collide after concatenation.
# ("=" assignments replaced with "<-"; the column is coerced to character by
# the first assignment, exactly as before.)
Services$RecordType[Services$RecordType == 144] <- "A"
Services$RecordType[Services$RecordType == 152] <- "B"
Services$TypeProvided <- paste0(Services$RecordType, as.character(Services$TypeProvided))
# Bin sparse service types together.
Services$TypeProvided[Services$TypeProvided == "A3"] <- "A5"
Services$TypeProvided[Services$TypeProvided %in% c("B10", "B11", "B14", "B3", "B5")] <- "B4"
Services$RecordType <- NULL
# Persist every cleaned table as <name>_Cleaned.csv (no row names).
cleaned_tables <- c("Client", "Disabilities", "EmploymentEducation", "Enrollment",
                    "Exit", "HealthAndDV", "IncomeBenefits", "Services")
for (tbl in cleaned_tables) {
  write.csv(get(tbl), file = paste0(tbl, "_Cleaned.csv"), row.names = FALSE)
}
# Reload the cleaned tables (fresh session entry point for the analysis below).
setwd("C:/Users/Kyu/Google Drive/Hackathon/data")
# BUG FIX: as with the raw load, the original passed the NA *value* in
# na.strings instead of the literal string "NA" (read.csv's default sentinel).
na_strs <- c("", " ", "NULL", "NA")
Client <- read.csv("Client_Cleaned.csv", na.strings=na_strs, stringsAsFactors=FALSE)
Disabilities <- read.csv("Disabilities_Cleaned.csv", na.strings=na_strs)
EmploymentEducation <- read.csv("EmploymentEducation_Cleaned.csv", na.strings=na_strs)
Enrollment <- read.csv("Enrollment_Cleaned.csv", na.strings=na_strs)
Exit <- read.csv("Exit_Cleaned.csv", na.strings=na_strs)
HealthAndDV <- read.csv("HealthAndDV_Cleaned.csv", na.strings=na_strs)
IncomeBenefits <- read.csv("IncomeBenefits_Cleaned.csv", na.strings=na_strs)
Services <- read.csv("Services_Cleaned.csv", na.strings=na_strs)
# Geo Map
library(maps)
library(ggmap)
library(dplyr)
# Load a map of STL into R (NOTE: get_map fetches tiles over the network):
STL <- get_map(location="Saint Louis", zoom=11)
# Count enrollments per 0.01-degree lat/lon cell.
LatLonCounts <- as.data.frame(table(round(Enrollment$longitude,2), round(Enrollment$latitude,2)))
LatLonCounts$Long <- as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat <- as.numeric(as.character(LatLonCounts$Var2))
# Bubble map: point size/colour encodes the enrollment count per cell.
g1 <- ggmap(STL) +
geom_point(data=LatLonCounts, aes(x=Long, y=Lat, color=Freq, size=Freq)) +
scale_colour_gradient(low="yellow", high="red")
g1
# 2-D density overlay of enrollment locations.
g2 <- ggmap(STL) +
stat_density2d(data=Enrollment, aes(x=longitude, y=latitude, fill=..level..), geom="polygon", alpha=0.2) +
scale_fill_gradient(low="yellow", high="red")
g2
# top 10 zipcodes with most homeless people
top10_zip <- Enrollment %>%
group_by(LastPermanentZIP) %>%
count() %>%
top_n(10, n) %>%
arrange(-n)
library(dplyr)
# remove all the time variables before building the ML dataset
str(Client)
## 'data.frame': 301 obs. of 9 variables:
## $ PersonalID : int 90077 90703 90739 91161 91501 95126 95759 98250 99577 101863 ...
## $ Gender.1 : int 1 1 1 1 1 0 0 1 1 1 ...
## $ VeteranStatus.1 : int 1 1 0 1 1 1 1 1 1 1 ...
## $ VeteranStatus.99 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ MilitaryBranch.1 : int 0 0 0 1 0 0 1 1 0 1 ...
## $ MilitaryBranch.2 : int 1 1 0 0 1 1 0 0 1 0 ...
## $ Discharge_Status.1: int 0 1 0 1 1 1 0 1 1 1 ...
## $ Discharge_Status.2: int 1 0 0 0 0 0 1 0 0 0 ...
## $ Age : int 6 6 4 6 4 3 5 5 5 5 ...
str(Disabilities)
## 'data.frame': 3318 obs. of 5 variables:
## $ PersonalID : int 90077 90077 90077 90077 90077 90077 90077 90077 90077 90077 ...
## $ DisabilityType : int 5 5 5 5 6 6 6 6 7 7 ...
## $ DisabilityResponse : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DataCollectionStage : int 1 2 3 2 1 2 3 2 1 2 ...
## $ InformationDateMonth: int 10 1 11 6 10 1 11 6 10 1 ...
Disabilities$InformationDateMonth <- NULL
str(EmploymentEducation)
## 'data.frame': 192 obs. of 4 variables:
## $ PersonalID : int 117753 181183 117753 182784 90739 182784 106089 106089 240712 240713 ...
## $ Employed : int 1 1 1 1 1 1 0 0 0 0 ...
## $ DataCollectionStage : int 1 1 3 1 1 3 1 3 1 1 ...
## $ InformationDateMonth: int 1 2 4 5 8 12 10 1 7 7 ...
EmploymentEducation$DataCollectionStage <- NULL
EmploymentEducation$InformationDateMonth <- NULL
str(Enrollment)
## 'data.frame': 324 obs. of 8 variables:
## $ LastPermanentZIP : int 62040 62201 62202 62202 62202 62202 62202 62202 62202 62205 ...
## $ PersonalID : int 248176 249588 143804 249804 152213 174662 249797 249873 95126 249975 ...
## $ EntryDate : Factor w/ 130 levels "2012-01-05","2012-02-02",..: 57 114 120 118 120 120 118 120 118 108 ...
## $ EntryDateMonth : int 9 1 2 1 2 2 1 2 1 1 ...
## $ latitude : num 38.7 38.6 38.6 38.6 38.6 ...
## $ longitude : num -90.1 -90.1 -90.2 -90.2 -90.2 ...
## $ RelationshipToHoH.2: int 1 0 1 1 1 0 1 1 0 0 ...
## $ RelationshipToHoH.3: int 0 0 0 0 0 0 0 0 0 0 ...
# Drop time and location columns; only the dummy-encoded features remain.
Enrollment$LastPermanentZIP <- NULL
Enrollment$EntryDate <- NULL
Enrollment$longitude <- NULL
Enrollment$latitude <- NULL
str(Exit)
## 'data.frame': 160 obs. of 4 variables:
## $ PersonalID : int 117753 181183 182784 106089 240712 174821 240942 90077 243203 245519 ...
## $ ExitDate : Factor w/ 62 levels "2012-04-05","2013-01-16",..: 1 62 30 2 44 3 3 18 44 60 ...
## $ Destination : int 3 1 5 2 5 5 5 5 5 2 ...
## $ ExitDateMonth: int 4 2 12 1 1 1 1 11 1 2 ...
Exit$ExitDate <- NULL
Exit$ExitDateMonth <- NULL
str(HealthAndDV)
## 'data.frame': 430 obs. of 3 variables:
## $ PersonalID : int 117753 181183 117753 182784 90739 182784 181183 106089 106089 240712 ...
## $ DomesticViolenceVictim: int 0 0 0 0 0 0 0 0 0 0 ...
## $ DataCollectionStage : int 1 1 3 1 1 3 3 1 3 1 ...
HealthAndDV$DataCollectionStage <- NULL
# Inspect IncomeBenefits, then drop the collection-stage bookkeeping column;
# the benefit indicator columns are the model features.
str(IncomeBenefits)
## 'data.frame': 566 obs. of 10 variables:
## $ PersonalID : int 117753 181183 117753 182784 90739 182784 181183 106089 106089 240712 ...
## $ Earned : int 1 1 1 1 1 1 1 0 0 0 ...
## $ SSI : int 0 0 0 0 0 0 0 1 1 0 ...
## $ SSDI : int 0 0 0 0 0 0 0 0 0 0 ...
## $ VADisabilityService : int 0 0 0 0 0 0 0 0 0 1 ...
## $ SNAP : int 1 0 1 0 0 0 0 0 1 1 ...
## $ InsuranceFromAnySource: int 0 0 0 0 0 1 0 0 0 1 ...
## $ Medicaid : int 0 0 0 0 0 0 0 1 0 1 ...
## $ SCHIP : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DataCollectionStage : int 1 1 3 1 1 3 3 1 3 1 ...
IncomeBenefits[["DataCollectionStage"]] <- NULL
# Inspect Services, then drop the raw service date; TypeProvided is the
# feature of interest.
str(Services)
## 'data.frame': 2684 obs. of 3 variables:
## $ PersonalID : int 247032 248537 248586 248537 248608 189603 189603 249591 189044 248812 ...
## $ DateProvided: Factor w/ 115 levels "1/1/2016","1/10/2016",..: 44 46 28 29 27 32 38 40 40 41 ...
## $ TypeProvided: Factor w/ 9 levels "A2","A4","A5",..: 1 1 1 4 1 1 6 5 4 1 ...
Services[["DateProvided"]] <- NULL
# Save each cleaned table as "<Name>_Cleaned_ML.csv" (same files as before,
# written via one Map over table/name pairs instead of eight repeated calls).
invisible(Map(
  function(df, nm) write.csv(df, file = paste0(nm, "_Cleaned_ML.csv"), row.names = FALSE),
  list(Client, Disabilities, EmploymentEducation, Enrollment,
       Exit, HealthAndDV, IncomeBenefits, Services),
  c("Client", "Disabilities", "EmploymentEducation", "Enrollment",
    "Exit", "HealthAndDV", "IncomeBenefits", "Services")
))
# Combine all tables into one modeling frame via successive right joins on
# PersonalID. Before each join, print the table's dimensions and its number of
# distinct persons so any fan-out from one-to-many keys is visible.
# NOTE(review): right_join() is from dplyr, presumably loaded earlier in the
# file -- confirm before running this section standalone.
dim(Disabilities)
## [1] 3318 4
length(unique(Disabilities[["PersonalID"]]))
## [1] 301
ml_data <- Disabilities
for (tbl in list(Services, IncomeBenefits, HealthAndDV,
                 Enrollment, Client, EmploymentEducation, Exit)) {
  print(dim(tbl))
  print(length(unique(tbl[["PersonalID"]])))
  ml_data <- right_join(ml_data, tbl, by = "PersonalID")
}
## Observed dimensions / distinct PersonalID counts, in join order:
## Services:            2684 x 2, 201 unique
## IncomeBenefits:       566 x 9, 301 unique
## HealthAndDV:          430 x 2, 259 unique
## Enrollment:           324 x 4, 275 unique
## Client:               301 x 9, 301 unique
## EmploymentEducation:  192 x 2, 110 unique
## Exit:                 160 x 2, 158 unique
library(caTools)
# ml_data$Destination <- as.factor(ml_data$Destination)
# Keep only fully observed rows (listwise deletion), then confirm that no
# missing values remain in any column.
ml_data <- ml_data[complete.cases(ml_data), ]
round(vapply(ml_data, function(col) 100 * mean(is.na(col)), numeric(1)), 3)
## PersonalID DisabilityType DisabilityResponse
## 0 0 0
## DataCollectionStage TypeProvided Earned
## 0 0 0
## SSI SSDI VADisabilityService
## 0 0 0
## SNAP InsuranceFromAnySource Medicaid
## 0 0 0
## SCHIP DomesticViolenceVictim EntryDateMonth
## 0 0 0
## RelationshipToHoH.2 RelationshipToHoH.3 Gender.1
## 0 0 0
## VeteranStatus.1 VeteranStatus.99 MilitaryBranch.1
## 0 0 0
## MilitaryBranch.2 Discharge_Status.1 Discharge_Status.2
## 0 0 0
## Age Employed Destination
## 0 0 0
# PersonalID is an identifier, not a feature; TypeProvided is categorical.
ml_data[["PersonalID"]] <- NULL
ml_data[["TypeProvided"]] <- as.factor(ml_data[["TypeProvided"]])
# Persist the combined modeling table
write.csv(ml_data, file = "ml_data.csv", row.names = FALSE)
# Split into 70% train / 30% test, stratified on the target via caTools.
model_data <- ml_data
# model_data <- sample(ml_data)[0:5000,]  # optional subsample for quick runs
set.seed(2000)
in_train <- sample.split(model_data$Destination, SplitRatio = 0.7)
train <- model_data[in_train, ]
test <- model_data[!in_train, ]
# Machine-learning toolkits: mlr for the task/learner API, FSelector for
# information-gain filtering.
library(mlr)
library(FSelector)
# Build the classification tasks and strip zero-variance features in one step.
train.task <- removeConstantFeatures(makeClassifTask(data = train, target = "Destination"))
## Removing 3 columns: RelationshipToHoH.2,RelationshipToHoH.3,VeteranStatus.99
test.task <- removeConstantFeatures(makeClassifTask(data = test, target = "Destination"))
## Removing 3 columns: RelationshipToHoH.2,RelationshipToHoH.3,VeteranStatus.99
# Rank features by information gain against the target and plot the chart.
var_imp <- generateFilterValuesData(train.task, method = c("information.gain"))
plotFilterValues(var_imp, feat.type.cols = TRUE)
# Keep the 7 highest-scoring features. top_n() is superseded in dplyr and
# silently ranked by the data frame's last column; slice_max() names the
# ranking column explicitly (both keep ties, so the selected set is unchanged).
imp_feat <- (var_imp$data %>% slice_max(information.gain, n = 7))$name
imp_feat <- c(imp_feat, "Destination") # add target variable
# Rebuild the train/test frames and tasks on the reduced feature set.
train <- train[, colnames(train) %in% imp_feat]
test <- test[, colnames(test) %in% imp_feat]
trainTask <- makeClassifTask(data = train, target = "Destination")
testTask <- makeClassifTask(data = test, target = "Destination")
# --- Hyperparameter search (kept for reference; commented out) ---------------
# The mlr-based random search below was run once to tune the random forest.
# Its reported best values (ntree = 9, mtry = 5, nodesize = 40; CV accuracy
# ~0.883) are hard-coded into the randomForest() fit that follows this block.
# getParamSet("classif.randomForest")
# create a learner
# rf <- makeLearner("classif.randomForest", predict.type="response", par.vals=list(ntree=200, mtry=3))
# rf$par.vals <- list(importance=TRUE)
#
# # set tunable parameters
# # grid search to find hyperparameters
# rf_param <- makeParamSet(makeIntegerParam("ntree", lower=5, upper=10),
# makeIntegerParam("mtry", lower=5, upper=10),
# makeIntegerParam("nodesize", lower=25, upper=50))
#
# # random search for 50 iterations
# rancontrol <- makeTuneControlRandom(maxit=50L)
#
# # set 3 fold cross validation
# set_cv <- makeResampleDesc("CV", iters=3L)
#
# # hypertuning
# rf_tune <- tuneParams(learner=rf, resampling=set_cv, task=train.task, par.set=rf_param, control=rancontrol, measures=acc)
# randomForest provides the final model fit used below.
library(randomForest)
#best parameters
# rf_tune$x
# $ntree
# [1] 9
#
# $mtry
# [1] 5
#
# $nodesize
# [1] 40
# cv accuracy
# rf_tune$y
# acc.test.mean
# 0.8829761
# using hyperparameters for modeling
# rf.tree <- setHyperPars(rf, par.vals=rf_tune$x)
# train a model
# rforest <- train(rf.tree, trainTask)
# getLearnerModel(t.rpart)
# make predictions
# rfmodel <- predict(rforest, test.task)
# Fit the final random forest with the tuned hyperparameters (ntree, mtry,
# nodesize from the commented search above), then evaluate on the hold-out set.
# Fix: replace T/F with TRUE/FALSE -- T and F are ordinary, reassignable
# bindings in R; only TRUE/FALSE are reserved words.
mdl.rf <- randomForest(as.factor(Destination) ~ .,
                       data = train,
                       replace = FALSE, ntree = 9,
                       do.trace = FALSE, mtry = 5, nodesize = 40)
pred.rf.test <- predict(mdl.rf, test)
# Coerce the reference to a factor sharing the prediction's level set: newer
# caret versions reject a numeric reference / mismatched levels.
# NOTE(review): confusionMatrix() is from caret, presumably loaded earlier in
# the file -- confirm. Assumes every Destination class in `test` also occurs
# in `train` (true for the recorded run below) -- verify on new data.
conf.mtx <- confusionMatrix(pred.rf.test,
                            factor(test$Destination, levels = levels(pred.rf.test)))
conf.mtx
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5
## 1 8864 0 8994 0 0
## 2 0 294 0 0 0
## 3 9957 2449 19056 0 153
## 4 0 0 0 3991 0
## 5 0 0 73 0 107008
##
## Overall Statistics
##
## Accuracy : 0.8655
## 95% CI : (0.8639, 0.8672)
## No Information Rate : 0.6663
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7355
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.47096 0.107182 0.6776 1.00000 0.9986
## Specificity 0.93667 1.000000 0.9054 1.00000 0.9986
## Pos Pred Value 0.49636 1.000000 0.6028 1.00000 0.9993
## Neg Pred Value 0.93036 0.984746 0.9298 1.00000 0.9972
## Prevalence 0.11702 0.017054 0.1749 0.02481 0.6663
## Detection Rate 0.05511 0.001828 0.1185 0.02481 0.6653
## Detection Prevalence 0.11103 0.001828 0.1966 0.02481 0.6658
## Balanced Accuracy 0.70382 0.553591 0.7915 1.00000 0.9986
# Persist the model artifacts; save(list = <name>) stores each object under
# its own name, exactly as the individual save(obj, ...) calls did.
# NOTE(review): top10_zip, g1 and g2 are not created in this section --
# presumably defined earlier in the file; confirm before running standalone.
for (obj_name in c("mdl.rf", "conf.mtx", "test", "top10_zip", "g1", "g2", "var_imp")) {
  save(list = obj_name, file = paste0(obj_name, ".RData"))
}
# --- Original MySQL extraction (kept for reference; commented out) -----------
# This pulled the Clean* tables directly from the hackathon database instead
# of the local CSVs used above. Credentials are hard-coded in plain text --
# do not re-enable without moving them to environment variables.
# library("RMySQL")
#
# # GET THAT DATABASE CONNECTION
# mydb <- dbConnect(RMySQL::MySQL(), user='GHack', password='GlobalHack123!',
# dbname='globalhack', host='Globalhack.il1.rdbs.ctl.io', port=49424)
#
# # FETCH NAMES & ADDRESSES!
# result <- dbSendQuery(mydb, "select *
# from CleanClient c
# LEFT JOIN (select * from CleanDisabilities group by CleanDisabilities.PersonalID) d on c.PersonalID = d.PersonalID
# LEFT JOIN (select * from CleanEmploymentEducation group by PersonalID) e on c.PersonalID = e.PersonalID
# LEFT JOIN (select * from CleanExitData group by PersonalID) x on c.PersonalID = x.PersonalID
# LEFT JOIN (select * from CleanHealthAndDV group by PersonalID) h on c.PersonalID = h.PersonalID
# LEFT JOIN (select * from CleanIncomeBenefits group by PersonalID) i on c.PersonalID = i.PersonalID
# LEFT JOIN (select * from CleanServices group by PersonalID) s on c.PersonalID = s.PersonalID
# LEFT JOIN (select * from CleanEnrollment group by PersonalID) r on c.PersonalID = r.PersonalID")
# names <- fetch(result, n=-1)
# dbClearResult(result)